Environment Setting
suppressMessages({
library(data.table)
library(dplyr)
library(plotly)
library(abn)
})Data Acquisition
var_path <- "data/20230311_tips.csv"
data <- var_path %>% fread()Pre-proccess
data <- data %>% mutate(
sex = sex %>% factor(., c("Female", "Male")),
smoker = smoker %>% factor(., c("No", "Yes")),
day = day %>% factor(., c("Thur", "Fri", "Sat", "Sun")),
time = time %>% factor(., c("Lunch", "Dinner"))
)Check Data
data %>% skimr::skim()| Name | Piped data |
| Number of rows | 244 |
| Number of columns | 7 |
| Key | NULL |
| _______________________ | |
| Column type frequency: | |
| factor | 4 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| sex | 0 | 1 | FALSE | 2 | Mal: 157, Fem: 87 |
| smoker | 0 | 1 | FALSE | 2 | No: 151, Yes: 93 |
| day | 0 | 1 | FALSE | 4 | Sat: 87, Sun: 76, Thu: 62, Fri: 19 |
| time | 0 | 1 | FALSE | 2 | Din: 176, Lun: 68 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| total_bill | 0 | 1 | 19.79 | 8.90 | 3.07 | 13.35 | 17.8 | 24.13 | 50.81 | ▃▇▃▁▁ |
| tip | 0 | 1 | 3.00 | 1.38 | 1.00 | 2.00 | 2.9 | 3.56 | 10.00 | ▇▆▂▁▁ |
| size | 0 | 1 | 2.57 | 0.95 | 1.00 | 2.00 | 2.0 | 3.00 | 6.00 | ▇▂▂▁▁ |
Visualization
3D Scatter
tip ~ size + total_bill
data %>%
plot_ly(x = ~size, y = ~total_bill, z = ~tip, type = "scatter3d", mode = "markers")tip ~ size + total_bill + smoker
data %>%
plot_ly(x = ~size, y = ~total_bill, z = ~tip, color = ~smoker, type = "scatter3d", mode = "markers")Scatter with lm
tip ~ size + total_bill + day +smoker + sex
ggplotly(
data %>%
mutate(
day = day %>% paste("day =", .),
smoker = smoker %>% paste("smoker =", .)
) %>%
ggplot(aes(x = total_bill, y = tip, color = sex)) +
geom_smooth(method = "lm", se = F, size = .5, linetype = 2, formula = "y ~ x") +
geom_point(alpha = .8) +
facet_wrap(smoker ~ day, ncol = 4, scales = "free") +
scale_x_continuous(labels = scales::label_comma()) +
scale_y_continuous(labels = scales::label_comma()) +
colorspace::scale_color_discrete_diverging("Green-Orange") +
colorspace::scale_fill_discrete_diverging("Green-Orange") +
ggthemes::theme_fivethirtyeight() +
theme(legend.position = "bottom")
)Box plot
Scatter with lm: tip ~ total_bill + day + sex
ggplotly(data %>%
mutate(
day = day %>% paste("day =", .),
smoker = smoker %>% paste("smoker =", .)
) %>%
ggplot(aes(x = sex, y = tip, color = sex)) +
geom_boxplot() +
facet_wrap(~day, ncol = 4, scales = "free") +
scale_y_continuous(labels = scales::label_comma()) +
colorspace::scale_color_discrete_diverging("Green-Orange") +
colorspace::scale_fill_discrete_diverging("Green-Orange") +
ggthemes::theme_fivethirtyeight() +
theme(legend.position = "bottom"))Hypothesis testing
Structure Discover
data_abn <- data %>%
mutate(day = forcats::fct_recode(day, "Weekday" = "Thur", "Weekday" = "Fri", "Weekend" = "Sat", "Weekend" = "Sun")) %>%
mutate_at(.vars = c("total_bill", "tip"), ~ log(.)) %>%
data.frame()
data.dists <- list(
"total_bill" = "gaussian",
"tip" = "gaussian",
"sex" = "binomial",
"smoker" = "binomial",
"day" = "binomial",
"time" = "binomial",
"size" = "gaussian"
)
data_abn %>%
buildScoreCache(data.df = ., data.dists = data.dists, max.parents = 1) %>%
mostProbable() %>%
fitAbn() %>%
plot()## Step1. completed max alpha_i(S) for all i and S
## Total sets g(S) to be evaluated over: 128
Regression
list_model <- c("total_bill", "size", "total_bill + size") %>%
paste("tip ~", .) %>%
c(., "size ~ total_bill") %>%
lapply(., as.formula) %>%
lapply(., function(x) {
glm(data = data_abn, formula = x)
})
suppressMessages({
list_model %>%
jtools::export_summs(.,
error_format = "[{conf.low}, {conf.high}]",
model.names = c("total bill", "size", "both", "size by total bill")
)
})| total bill | size | both | size by total bill | |
|---|---|---|---|---|
| (Intercept) | -0.95 *** | 0.44 *** | -0.89 *** | -1.12 *** |
| [-1.22, -0.68] | [0.30, 0.57] | [-1.16, -0.61] | [-1.77, -0.48] | |
| total_bill | 0.68 *** | 0.60 *** | 1.28 *** | |
| [0.58, 0.77] | [0.49, 0.72] | [1.06, 1.50] | ||
| size | 0.22 *** | 0.06 * | ||
| [0.17, 0.27] | [0.00, 0.11] | |||
| N | 244 | 244 | 244 | 244 |
| AIC | 141.35 | 228.26 | 138.81 | 568.73 |
| BIC | 151.85 | 238.75 | 152.80 | 579.23 |
| Pseudo R2 | 0.67 | 0.34 | 0.68 | 0.37 |
| *** p < 0.001; ** p < 0.01; * p < 0.05. | ||||
Plot model fitness
total_bill
par(mfrow = c(2, 2))
list_model[[1]]%>%plot()both
par(mfrow = c(2, 2))
list_model[[3]]%>%plot()